In [1]:
import os
import sys
In [2]:
import pandas
In [3]:
sys.path.append(os.path.join(os.path.abspath('../..'), 'src'))
In [4]:
import utils
In [5]:
with open('/home/immersinn/Dropbox/Analytics/NCGA/PACDocs/NC Campaign Document Search By Type-2014.html') as f:
html = f.read()
In [6]:
from bs4 import BeautifulSoup as bs
In [7]:
soup = bs(html, 'html.parser')
In [8]:
tables = soup.find_all('table')
In [9]:
len(tables)
Out[9]:
In [10]:
table = tables[0]
In [11]:
rows = table.find_all('tr')
In [12]:
len(rows)
Out[12]:
In [13]:
header = rows[0]
col_names = [h.text for h in header.find_all('td')]
col_names
Out[13]:
In [14]:
sub_info = rows[1:100]
In [15]:
sub_info[0].find_all('td')
Out[15]:
In [16]:
for cn,col in zip(col_names, sub_info[3].find_all('td')):
a = col.find('a')
if a:
try:
print(cn + ' - ' + col.text + ': ' + col.find('a')['href'])
except KeyError:
print(cn + ' - ' + 'NO DATA LINK')
else:
print(cn + ' - ' + col.text)
In [17]:
def process_table(table, limit=100):
def process_row(row):
data = {}
for cn, col in zip(col_names, row.find_all('td')):
a = col.find('a')
if a:
try:
data[cn] = a['href']
except KeyError:
data[cn] = ''
else:
data[cn] = col.text.strip()
return(data)
rows = table.find_all('tr')
col_names = [h.text for h in rows[0].find_all('td')]
data = [process_row(row) for row in rows[1:limit]]
df = pandas.DataFrame(data)
df = df[col_names]
return(df)
In [18]:
df = process_table(table, limit=100)
In [42]:
df.shape
Out[42]:
In [20]:
df.head()
Out[20]:
In [21]:
df['Report Type'].unique()
Out[21]:
In [37]:
reload(process_NCCampaignDocument_data)
Out[37]:
In [31]:
import process_NCCampaignDocument_data
In [65]:
df2015 = process_NCCampaignDocument_data.lae_nccd('2015')
In [66]:
df2015.shape
Out[66]:
In [67]:
df2015.head()
Out[67]:
In [68]:
df2015.tail()
Out[68]:
In [70]:
df2015[df2015.data_link=='MISSING DATA']
Out[70]:
In [46]:
df2015[''].apply(len).describe()
Out[46]:
In [50]:
df2015['Received Date'][0].split('/')
Out[50]:
In [51]:
df2014.columns
Out[51]:
In [52]:
col_mapping = {'Committee Name' : 'committee',
'Report Year' : 'report_year',
'Report Type' : 'report_type',
'Amend' : 'amend',
'Received Date' : 'rec_date',
'Start Date' : 'start_date',
'End Date' : 'end_date',
'Image' : 'image_link',
'Data' : 'data_link',
}
In [53]:
df2014.columns = [col_mapping[c] for c in df2015.columns]
In [54]:
df2015.head()
Out[54]:
In [96]:
import init_NCCampaignDocument_table
In [139]:
reload(init_NCCampaignDocument_table)
Out[139]:
In [143]:
df2014 = init_NCCampaignDocument_table.lae_nccd('2014')
In [144]:
df2014.shape
Out[144]:
In [145]:
df2014.head()
Out[145]:
In [146]:
df2014.tail()
Out[146]:
In [147]:
df2014.committee.apply(len).describe()
Out[147]:
In [149]:
df2014.image_link.apply(len).describe()
Out[149]:
In [150]:
df2014.data_link.apply(len).describe()
Out[150]:
In [151]:
df2014.rec_date.describe()
Out[151]:
In [3]:
with open('/home/immersinn/gits/ncga/references/report_type_names.txt', 'r') as f:
rtn = f.readlines()
In [4]:
def process_rtn_entry(ent):
ent = ent.strip('\n')
ent = ent.split(' - ')[1]
ent = ent.split(' (')[0]
return(ent)
In [5]:
rtns = list(set([process_rtn_entry(r) for r in rtn]))
rtns = sorted(rtns)
rtns
Out[5]:
In [9]:
','.join(["'" + r + "'" for r in rtns])
Out[9]:
In [164]:
sorted(dfall.report_type.unique())
Out[164]: